Load R packages

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggpubr)
library(rstatix)
library(expss)
library(DESeq2)
library(knitr) ; library(kableExtra)

Load preprocessed dataset (code in 2.1.Preprocessing_pipeline.Rmd)

# SFARI Genes
SFARI_genes = read_csv('./../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')

# Load Gupta dataset
load('./../Data/preprocessedData/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
datMeta = datMeta %>% dplyr::rename(Batch = RNAExtractionBatch)

# Updates genes_info with SFARI information
genes_info = genes_info %>% left_join(SFARI_genes, by = 'ID') %>%
             mutate(gene.score = ifelse(is.na(`gene-score`) & Neuronal==0, 'Others', 
                                        ifelse(is.na(`gene-score`), 'Neuronal', `gene-score`))) %>%
             mutate(Group = factor(ifelse(gene.score %in% c('Neuronal','Others'), gene.score, 'SFARI'), 
                    levels = c('SFARI', 'Neuronal', 'Others')))

# SFARI palette
SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}




There are 767 SFARI Genes in the expression dataset (~69%), 692 of them have a SFARI Gene Score.

Gene count by SFARI score:

table_info = genes_info %>% apply_labels(`gene-score` = 'SFARI Gene Score', syndromic = 'Syndromic Tag',
                                          Neuronal = 'Neuronal Function', gene.score = 'Gene Score') %>%
             mutate(syndromic = as.logical(syndromic), Neuronal = as.logical(Neuronal))

cro(table_info$`gene-score`)
 #Total 
 SFARI Gene Score 
   1  141
   2  185
   3  366
   #Total cases  692


Gene count by Syndromic tag:

cro(table_info$syndromic)
 #Total 
 Syndromic Tag 
   FALSE  651
   TRUE  116
   #Total cases  767


Neuronal annotations:


1360 genes have neuronal-related annotations, 188 of these, have a SFARI score

cro(table_info$gene.score[genes_info$`gene-score` %in% as.character(c(1:3))],
    list(table_info$Neuronal[genes_info$`gene-score` %in% as.character(c(1:3))], total()))
 Neuronal Function     #Total 
 FALSE   TRUE   
 Gene Score 
   1  95 46   141
   2  130 55   185
   3  279 87   366
   #Total cases  504 188   692
rm(table_info)




2.4.1 Analysis of all SFARI Genes together


Mean Expression

plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr)) %>% 
            left_join(genes_info, by='ID')

wt = plot_data %>% wilcox_test(MeanExpr~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = 1
base = 15.7
pos_y_comparisons = c(base, base+increase, base)
plot_data %>% ggplot(aes(Group, MeanExpr)) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .03) +
     scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) + 
     xlab('') + ylab('Mean Expression') + #ggtitle('Mean Expression Comparison') +
     scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
                              'Neuronal' = 'Neuronal\ngenes')) +
     theme_minimal() + theme(legend.position='none')

Log Fold Change

The propotion of over and underexpressed genes is very simiar for each group, so we can just focus on the magnitude of the LFC independently of the sign.

genes_info %>% mutate(direction = ifelse(log2FoldChange>0, 'overexpressed', 'underexpressed')) %>% 
               group_by(Group, direction) %>% tally(name = 'overexpressed') %>% 
               filter(direction == 'overexpressed') %>% ungroup %>% 
               left_join(genes_info %>% group_by(Group) %>% tally(name = 'Total'), by = 'Group') %>% ungroup %>%
               mutate('underexpressed' = Total - overexpressed , 
                      'ratio' = round(overexpressed/underexpressed,2)) %>% 
               dplyr::select(Group, overexpressed, underexpressed, ratio) %>% 
               kable %>% kable_styling(full_width = F)
Group overexpressed underexpressed ratio
SFARI 380 312 1.22
Neuronal 615 557 1.10
Others 6006 5286 1.14


plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr)) %>% 
            left_join(genes_info, by='ID')

wt = plot_data %>% filter(log2FoldChange>0) %>% 
     wilcox_test(MeanExpr~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = 1
base = 15.7
pos_y_comparisons = c(base, base+increase, base)
p1 = plot_data %>% filter(log2FoldChange>0) %>% ggplot(aes(Group, MeanExpr)) +
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .03) +
     scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) + 
     xlab('') + ylab('Mean Expression') + ggtitle('Overexpressed genes') +
     scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
                              'Neuronal' = 'Neuronal\ngenes')) +
     theme_minimal() + theme(legend.position='none')

wt = plot_data %>% filter(log2FoldChange<0) %>% 
     wilcox_test(MeanExpr~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')
p2 = plot_data %>% filter(log2FoldChange<0) %>% ggplot(aes(Group, MeanExpr)) +
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .03) +
     scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) + 
     xlab('') + ylab('Mean Expression') + ggtitle('Underexpressed genes') +
     scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
                              'Neuronal' = 'Neuronal\ngenes')) +
     theme_minimal() + theme(legend.position='none')

grid.arrange(p1, p2, nrow=1)


The difference in LFC magnitude between SFARI genes and Neuronal genes is visible but nolonger statistically significant. The only significant difference is found between Neuronal genes and non-SFARI, non-Neuronal genes when comparing the shrunken version of the LFC magnitudes.

wt = genes_info %>% mutate(abs_lfc = abs(log2FoldChange), 
                           Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>% 
     wilcox_test(abs_lfc~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')

increase = 0.04
base = 0.55
pos_y_comparisons = c(base, base + increase, base)
p1 = genes_info %>% mutate(Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>% 
     ggplot(aes(Group, abs(log2FoldChange))) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .005) + 
     scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(7,8)))) + 
     coord_cartesian(ylim= c(0, max(pos_y_comparisons))) + 
     scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
                              'Neuronal' = 'Neuronal\ngenes')) +
     xlab('') + ylab('Original LFC Magnitude') + theme_minimal() + theme(legend.position='none')


wt = genes_info %>% mutate(abs_lfc = abs(shrunken_log2FoldChange), 
                           Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>% 
     wilcox_test(abs_lfc~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')

increase = 0.03
base = 0.37
pos_y_comparisons = c(base, base + increase, base)
p2 = genes_info %>% mutate(Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>% 
     ggplot(aes(Group, abs(shrunken_log2FoldChange))) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .01) + 
     scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(7,8)))) + 
     coord_cartesian(ylim= c(0, max(pos_y_comparisons))) + 
     scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
                              'Neuronal' = 'Neuronal\ngenes')) +
     xlab('') + ylab('Shrunken LFC Magnitude') + theme_minimal() + theme(legend.position='none')

grid.arrange(p1, p2, nrow = 1)

rm(increase, base, pos_y_comparisons, wt)

Full plots, without cropping out outliers

p1 = ggplotly(genes_info %>% mutate(Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>% 
     ggplot(aes(Group, abs(log2FoldChange))) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) + 
     scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(7,8)))) + 
     scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
                              'Neuronal' = 'Neuronal\ngenes')) +
     xlab('') + ylab('Original LFC Magnitude') + theme_minimal() + theme(legend.position='none'))
  
p2 = ggplotly(genes_info %>% mutate(Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>% 
     ggplot(aes(Group, abs(shrunken_log2FoldChange))) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) + 
     scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(7,8)))) + 
     scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
                              'Neuronal' = 'Neuronal\ngenes')) +
     xlab('') + ylab('Shrunken LFC Magnitude') + theme_minimal() + theme(legend.position='none'))
  

subplot(p1, p2, nrows = 1)


Differential Expression

genes_info %>% group_by(Group, significant) %>% tally(name = 'DE') %>% filter(significant) %>% ungroup %>%
               left_join(genes_info %>% group_by(Group) %>% tally(name = 'Total'), by = 'Group') %>% ungroup %>%
               mutate('prop_DE' = round(DE/Total,2)) %>% dplyr::select(Group, DE, prop_DE, Total) %>% 
               kable(caption = 'Proportion of DE Genes by Group') %>% kable_styling(full_width = F)
Proportion of DE Genes by Group
Group DE prop_DE Total
SFARI 29 0.04 692
Neuronal 63 0.05 1172
Others 368 0.03 11292


lfc_list = seq(0, 0.2, 0.01)

all_counts = data.frame('group'='All', 'n'=as.character(nrow(genes_info)))
Others_counts = data.frame('group'='Others', n=as.character(sum(genes_info$Group == 'Others')))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(genes_info$Group == 'Neuronal')))
lfc_counts_all = genes_info %>% filter(Group == 'SFARI') %>% tally %>%
                 mutate('group'='SFARI', 'n'=as.character(n)) %>% 
                 dplyr::select(group, n) %>%
                 bind_rows(Neuronal_counts, Others_counts, all_counts) %>%
                 mutate('lfc'=-1) %>% dplyr::select(lfc, group, n)

for(lfc in lfc_list){
  
  # Recalculate genes_info with the new threshold (p-values change)
  DE_genes = results(dds, lfcThreshold=lfc, altHypothesis='greaterAbs') %>% data.frame %>% 
             mutate('ID' = rownames(.)) %>% 
             left_join(genes_info %>% dplyr::select(ID, Neuronal, gene.score, Group), by = 'ID') %>% 
             filter(padj<0.05 & abs(log2FoldChange)>lfc)

  
  # Calculate counts by groups
  all_counts = data.frame('group'='All', 'n'=as.character(nrow(DE_genes)))
  Others_counts = data.frame('group'='Others', n=as.character(sum(DE_genes$Group == 'Others')))
  Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(DE_genes$Group == 'Neuronal')))
  lfc_counts = DE_genes %>% filter(Group == 'SFARI') %>% tally %>%
               mutate('group'='SFARI', 'n'=as.character(n)) %>%
               bind_rows(Neuronal_counts, Others_counts, all_counts) %>%
               mutate('lfc'=lfc) %>% dplyr::select(lfc, group, n)
  
  
  # Update lfc_counts_all
  lfc_counts_all = lfc_counts_all %>% bind_rows(lfc_counts)
}

# Add missing entries with 0s
lfc_counts_all = expand.grid('group'=unique(lfc_counts_all$group), 'lfc'=unique(lfc_counts_all$lfc)) %>% 
  left_join(lfc_counts_all, by=c('group','lfc')) %>% replace(is.na(.), 0)

# Calculate percentage of each group remaining
tot_counts = genes_info %>% filter(Group == 'SFARI') %>% tally() %>%
             mutate('group'='SFARI', 'tot'=n) %>% dplyr::select(group, tot) %>%
             bind_rows(data.frame('group'='Neuronal', 'tot'=sum(genes_info$Group == 'Neuronal')),
                       data.frame('group' = 'Others', 'tot' = sum(genes_info$Group == 'Others')),
                       data.frame('group'='All', 'tot'=nrow(genes_info)))

lfc_counts_all = lfc_counts_all %>% filter(lfc!=-1) %>% #, group!='Others') %>% 
                 left_join(tot_counts, by='group') %>% mutate('perc'=round(100*as.numeric(n)/tot,2))


# Plot change of number of genes
ggplotly(lfc_counts_all %>% filter(group != 'All') %>% 
         mutate(group = factor(group, levels = c('SFARI', 'Neuronal', 'Others'))) %>%
         ggplot(aes(lfc, perc, color=group)) + geom_point(aes(id=n)) + geom_line() +
         scale_color_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) + 
         ylab('Percentage of DE Genes') +  xlab('LFC threshold') + labs(color = 'Group') + 
         ggtitle('Effect of filtering thresholds in SFARI Genes') + 
         theme_minimal() + theme(legend.position = 'bottom'))
rm(lfc_list, all_counts, Others_counts, Neuronal_counts, lfc_counts, lfc_counts_all, DE_genes, tot_counts)




2.4.2 Analysis of SFARI Genes by SFARI Score


Mean Expression

plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr)) %>% left_join(genes_info, by='ID')

wt = plot_data %>% wilcox_test(MeanExpr~gene.score, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = 1
base = 15.5
pos_y_comparisons = c(base + c(0,1,3,5)*increase, base+c(0,2,4)*increase, base+0:1*increase, base)
plot_data %>% ggplot(aes(gene.score, MeanExpr)) + 
              geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) +
              stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .01) +
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + 
              xlab('') + ylab('Mean Expression') + 
              scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3', 
                                        'Others' = 'Other\ngenes', 'Neuronal' = 'Neuronal\ngenes')) +
              theme_minimal() + theme(legend.position='none')

Log Fold Change

genes_info %>% mutate(direction = ifelse(log2FoldChange>0, 'overexpressed', 'underexpressed')) %>% 
               group_by(gene.score, direction) %>% tally(name = 'overexpressed') %>% 
               filter(direction == 'overexpressed') %>% ungroup %>% 
               left_join(genes_info %>% group_by(gene.score) %>% tally(name = 'Total'), by = 'gene.score') %>% ungroup %>%
               mutate('underexpressed' = Total - overexpressed , 
                      'ratio' = round(overexpressed/underexpressed,2)) %>% 
               dplyr::select(gene.score, overexpressed, underexpressed, ratio) %>% 
               kable %>% kable_styling(full_width = F)
gene.score overexpressed underexpressed ratio
1 76 65 1.17
2 97 88 1.10
3 207 159 1.30
Neuronal 615 557 1.10
Others 6006 5286 1.14


plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr)) %>% 
            left_join(genes_info, by='ID')

wt = plot_data %>% filter(log2FoldChange>0) %>% 
     wilcox_test(MeanExpr~gene.score, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = 1
base = 15.6
pos_y_comparisons = c(base + c(0,1,3,5)*increase, base+c(0,2,4)*increase, base+0:1*increase, base)
p1 = plot_data %>% filter(log2FoldChange>0) %>% ggplot(aes(gene.score, MeanExpr)) +
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .02) +
     scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + 
     xlab('') + ylab('Mean Expression') + ggtitle('Overexpressed genes') +
     scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3', 
                                        'Others' = 'Other\ngenes', 'Neuronal' = 'Neuronal\ngenes')) +
     theme_minimal() + theme(legend.position='none')

wt = plot_data %>% filter(log2FoldChange<0) %>% 
     wilcox_test(MeanExpr~gene.score, p.adjust.method='BH') %>% add_x_position(x = 'group')
p2 = plot_data %>% filter(log2FoldChange<0) %>% ggplot(aes(gene.score, MeanExpr)) +
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .02) +
      scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + 
     xlab('') + ylab('Mean Expression') + ggtitle('Underexpressed genes') +
     scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3', 
                                        'Others' = 'Other\ngenes', 'Neuronal' = 'Neuronal\ngenes')) +
     theme_minimal() + theme(legend.position='none')

grid.arrange(p1, p2, nrow=1)


The only statistically significant difference between groups is Neuronal genes and non-Neuronal non-SFARI genes when comparing the magnitude of the shrunken LFC values.

wt = genes_info %>% mutate(abs_lfc = abs(log2FoldChange), 
                           Group = factor(gene.score, levels = c('1','2','3','Others','Neuronal'))) %>% 
     wilcox_test(abs_lfc~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')

increase = 0.04
base = 0.55
pos_y_comparisons = c(base + c(0,1,3,5)*increase, base+c(0,2,4)*increase, base+0:1*increase, base)
p1 = genes_info %>% mutate(gene.score = factor(gene.score, levels = c('1','2','3','Others','Neuronal'))) %>%
     ggplot(aes(gene.score, abs(log2FoldChange))) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .005) + 
     scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,7,8))) + 
     coord_cartesian(ylim= c(0, max(pos_y_comparisons))) + 
     scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3', 
                               'Neuronal' = 'Neuronal\ngenes','Others' = 'Other\ngenes')) +
     xlab('') + ylab('Original LFC Magnitude') + theme_minimal() + theme(legend.position='none')


wt = genes_info %>% mutate(abs_lfc = abs(shrunken_log2FoldChange), 
                           Group = factor(gene.score, levels = c('1','2','3','Others', 'Neuronal'))) %>% 
     wilcox_test(abs_lfc~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')

increase = 0.03
base = 0.38
pos_y_comparisons = c(base + c(0,1,3,5)*increase, base+c(0,2,4)*increase, base+0:1*increase, base)
p2 = genes_info %>% mutate(gene.score = factor(gene.score, levels = c('1','2','3','Others','Neuronal'))) %>%
     ggplot(aes(gene.score, abs(shrunken_log2FoldChange))) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .01) + 
     scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,7,8))) + 
     coord_cartesian(ylim= c(0, max(pos_y_comparisons))) + 
     scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3', 
                               'Neuronal' = 'Neuronal\ngenes', 'Others' = 'Other\ngenes')) +
     xlab('') + ylab('Shrunken LFC Magnitude') + theme_minimal() + theme(legend.position='none')

grid.arrange(p1, p2, nrow = 1)

rm(increase, base, pos_y_comparisons, wt)


Full plots, without cropping out outliers

p1 = genes_info %>% mutate(gene.score = factor(gene.score, levels = c('1','2','3','Others','Neuronal'))) %>%
     ggplot(aes(gene.score, abs(log2FoldChange))) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) + 
     scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,7,8))) + 
     scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3', 
                               'Neuronal' = 'Neuronal\ngenes','Others' = 'Other\ngenes')) +
     xlab('') + ylab('Original LFC Magnitude') + theme_minimal() + theme(legend.position='none')

p2 = genes_info %>% mutate(gene.score = factor(gene.score, levels = c('1','2','3','Others','Neuronal'))) %>%
     ggplot(aes(gene.score, abs(shrunken_log2FoldChange))) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) + 
     scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,7,8))) + 
     scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3', 
                               'Neuronal' = 'Neuronal\ngenes', 'Others' = 'Other\ngenes')) +
     xlab('') + ylab('Shrunken LFC Magnitude') + theme_minimal() + theme(legend.position='none')

subplot(ggplotly(p1), ggplotly(p2), nrows=1)


Differential Expression


lfc_list = seq(0, 0.2, 0.01)

all_counts = data.frame('group'='All', 'n'=as.character(nrow(genes_info)))
Others_counts = data.frame('group'='Others', n=as.character(sum(genes_info$gene.score == 'Others')))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(genes_info$gene.score == 'Neuronal')))
lfc_counts_all = genes_info %>% group_by(`gene-score`) %>% filter(`gene-score` != 'Others') %>% tally %>%
                 mutate('group'=as.factor(`gene-score`), 'n'=as.character(n)) %>%
                 dplyr::select(group, n) %>%
                 bind_rows(Neuronal_counts, Others_counts, all_counts) %>%
                 mutate('lfc'=-1) %>%  dplyr::select(lfc, group, n)

for(lfc in lfc_list){
  
  # Recalculate genes_info with the new threshold (p-values change)
  DE_genes = results(dds, lfcThreshold=lfc, altHypothesis='greaterAbs') %>% data.frame %>%
             mutate('ID'=rownames(.)) %>% 
             left_join(genes_info %>% dplyr::select(ID, Neuronal, gene.score), by='ID') %>% 
             filter(padj<0.05 & abs(log2FoldChange)>lfc)

  
  # Calculate counts by groups
  all_counts = data.frame('group'='All', 'n'=as.character(nrow(DE_genes)))
  Others_counts = data.frame('group'='Others', n=as.character(sum(DE_genes$gene.score == 'Others')))
  Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(DE_genes$gene.score == 'Neuronal')))
  lfc_counts = DE_genes %>% group_by(gene.score) %>% tally %>%
               mutate('group'=gene.score, 'n'=as.character(n)) %>%
               bind_rows(Neuronal_counts, all_counts) %>%
               mutate('lfc'=lfc) %>% dplyr::select(lfc, group, n)
  
  
  # Update lfc_counts_all
  lfc_counts_all = lfc_counts_all %>% bind_rows(lfc_counts)
}

# Add missing entries with 0s
lfc_counts_all = expand.grid('group'=unique(lfc_counts_all$group), 'lfc'=unique(lfc_counts_all$lfc)) %>% 
  left_join(lfc_counts_all, by=c('group','lfc')) %>% replace(is.na(.), 0)

# Calculate percentage of each group remaining
tot_counts = genes_info %>% group_by(`gene-score`) %>% tally() %>% filter(`gene-score`!='Others') %>%
             mutate('group'=factor(`gene-score`), 'tot'=n) %>% dplyr::select(group, tot) %>%
             bind_rows(data.frame('group'='Neuronal', 'tot'=sum(genes_info$gene.score == 'Neuronal')),
                       data.frame('group'='Others', 'tot'=sum(genes_info$gene.score == 'Others')),
                       data.frame('group'='All', 'tot'=nrow(genes_info)))

lfc_counts_all = lfc_counts_all %>% filter(lfc!=-1) %>% #, group!='Others') %>% 
                 left_join(tot_counts, by='group') %>% mutate('perc'=round(100*as.numeric(n)/tot,2))


# Plot change of number of genes
ggplotly(lfc_counts_all %>% filter(group != 'All') %>% 
         mutate(group = ifelse(group %in% c('1','2','3'), paste0('Score ',group), group)) %>% 
         mutate(group = factor(group, levels = c('Score 1', 'Score 2', 'Score 3', 'Neuronal', 'Others'))) %>%
         ggplot(aes(lfc, perc, color=group)) + 
         geom_point(aes(id=n)) + geom_line() + scale_color_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + 
         ylab('Percentage of DE Genes') +  xlab('LFC threshold') +  labs(color = 'Group') +
         #ggtitle('Effect of filtering thresholds by SFARI score') + 
         #guides(color=guide_legend(nrow=2,byrow=TRUE) + # for breaking the legend into two rows
         theme_minimal() + theme(legend.position = 'bottom'))
rm(lfc_list, all_counts, Neuronal_counts, lfc_counts_all, lfc, lfc_counts, lfc_counts_all, tot_counts,
   lfc_counts_all, Others_counts)

Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.1.0            knitr_1.32                 
##  [3] DESeq2_1.24.0               SummarizedExperiment_1.14.1
##  [5] DelayedArray_0.10.0         BiocParallel_1.18.1        
##  [7] matrixStats_0.56.0          Biobase_2.44.0             
##  [9] GenomicRanges_1.36.1        GenomeInfoDb_1.20.0        
## [11] IRanges_2.18.3              S4Vectors_0.22.1           
## [13] BiocGenerics_0.30.0         expss_0.10.2               
## [15] rstatix_0.6.0               ggpubr_0.2.5               
## [17] magrittr_2.0.1              GGally_1.5.0               
## [19] gridExtra_2.3               viridis_0.5.1              
## [21] viridisLite_0.4.0           RColorBrewer_1.1-2         
## [23] plotly_4.9.2                glue_1.4.2                 
## [25] reshape2_1.4.4              forcats_0.5.0              
## [27] stringr_1.4.0               dplyr_1.0.1                
## [29] purrr_0.3.4                 readr_1.3.1                
## [31] tidyr_1.1.0                 tibble_3.1.2               
## [33] ggplot2_3.3.5               tidyverse_1.3.0            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.1.8        Hmisc_4.4-0           
##   [4] plyr_1.8.6             lazyeval_0.2.2         splines_3.6.3         
##   [7] crosstalk_1.1.0.1      digest_0.6.27          htmltools_0.5.1.1     
##  [10] fansi_0.5.0            checkmate_2.0.0        memoise_1.1.0         
##  [13] cluster_2.1.0          openxlsx_4.1.4         annotate_1.62.0       
##  [16] modelr_0.1.6           jpeg_0.1-8.1           colorspace_2.0-2      
##  [19] blob_1.2.1             rvest_0.3.5            haven_2.2.0           
##  [22] xfun_0.22              crayon_1.4.1           RCurl_1.98-1.2        
##  [25] jsonlite_1.7.2         genefilter_1.66.0      survival_3.2-7        
##  [28] gtable_0.3.0           zlibbioc_1.30.0        XVector_0.24.0        
##  [31] webshot_0.5.2          car_3.0-7              abind_1.4-5           
##  [34] scales_1.1.1           DBI_1.1.0              Rcpp_1.0.6            
##  [37] xtable_1.8-4           htmlTable_1.13.3       foreign_0.8-76        
##  [40] bit_4.0.4              Formula_1.2-3          htmlwidgets_1.5.1     
##  [43] httr_1.4.2             acepack_1.4.1          ellipsis_0.3.2        
##  [46] pkgconfig_2.0.3        reshape_0.8.8          XML_3.99-0.3          
##  [49] farver_2.1.0           nnet_7.3-14            sass_0.4.0            
##  [52] dbplyr_1.4.2           locfit_1.5-9.4         utf8_1.2.1            
##  [55] tidyselect_1.1.0       labeling_0.4.2         rlang_0.4.11          
##  [58] AnnotationDbi_1.46.1   munsell_0.5.0          cellranger_1.1.0      
##  [61] tools_3.6.3            cli_2.5.0              generics_0.0.2        
##  [64] RSQLite_2.2.0          broom_0.7.0            evaluate_0.14         
##  [67] yaml_2.2.1             bit64_4.0.5            fs_1.5.0              
##  [70] zip_2.0.4              xml2_1.2.5             compiler_3.6.3        
##  [73] rstudioapi_0.11        curl_4.3               png_0.1-7             
##  [76] ggsignif_0.6.0         reprex_0.3.0           geneplotter_1.62.0    
##  [79] bslib_0.2.5.1          stringi_1.5.3          highr_0.8             
##  [82] lattice_0.20-41        Matrix_1.2-18          vctrs_0.3.8           
##  [85] pillar_1.6.1           lifecycle_1.0.0        jquerylib_0.1.4       
##  [88] data.table_1.12.8      bitops_1.0-6           R6_2.5.0              
##  [91] latticeExtra_0.6-29    rio_0.5.16             assertthat_0.2.1      
##  [94] withr_2.4.2            GenomeInfoDbData_1.2.1 hms_0.5.3             
##  [97] grid_3.6.3             rpart_4.1-15           rmarkdown_2.7         
## [100] carData_3.0-3          lubridate_1.7.4        base64enc_0.1-3